In pandas
we have several possibilities to read data and several possibilities to write data.
In the Datos folder you can find a file mast.txt with the following format:
130904 0000 2.21 2.58 113.5 999.99 999.99 99.99 9999.99 9999.99 0.11
130904 0010 1.69 2.31 99.9 999.99 999.99 99.99 9999.99 9999.99 0.35
130904 0020 1.28 1.50 96.0 999.99 999.99 99.99 9999.99 9999.99 0.08
130904 0030 1.94 2.39 99.2 999.99 999.99 99.99 9999.99 9999.99 0.26
130904 0040 2.17 2.67 108.4 999.99 999.99 99.99 9999.99 9999.99 0.23
130904 0050 2.25 2.89 105.0 999.99 999.99 99.99 9999.99 9999.99 0.35
...
We can read in the following manner:
In [ ]:
# First, imports
import os
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
np.random.seed(19760812)
%matplotlib inline
In [ ]:
ipath = os.path.join('Datos', 'mast.txt')
wind = pd.read_csv(ipath)
wind.head(3)
In [ ]:
wind = pd.read_csv(ipath, sep = "\s*")
# When we work with text separated by whitespaces we can use the keyword delim_whitespace:
# wind = pd.read_csv(path, delim_whitespace = True)
wind.head(3)
In [ ]:
cols = ['Date', 'time', 'wspd', 'wspd_max', 'wdir',
'x1', 'x2', 'x3', 'x4', 'x5',
'wspd_std']
wind = pd.read_csv(ipath, sep = "\s*", names = cols)
wind.head(3)
In [ ]:
cols = ['Date', 'time', 'wspd', 'wspd_max', 'wdir',
'x1', 'x2', 'x3', 'x4', 'x5',
'wspd_std']
wind = pd.read_csv(ipath, sep = "\s*", names = cols,
parse_dates = [[0, 1]])
wind.head(3)
Depending of your operative system dates can be right or not. Don't worry now about this. Later we will work on this.
In [ ]:
cols = ['Date', 'time', 'wspd', 'wspd_max', 'wdir',
'x1', 'x2', 'x3', 'x4', 'x5',
'wspd_std']
wind = pd.read_csv(ipath, sep = "\s*", names = cols,
parse_dates = [[0, 1]], index_col = 0)
wind.head(3)
In [ ]:
cols = ['Date', 'time', 'wspd', 'wspd_max', 'wdir',
'x1', 'x2', 'x3', 'x4', 'x5',
'wspd_std']
wind = pd.read_csv(ipath, sep = "\s*", names = cols,
parse_dates = {'timestamp': [0, 1]}, index_col = 0)
wind.head(3)
In [ ]:
# The previous code is equivalent to
cols = ['Date', 'time', 'wspd', 'wspd_max', 'wdir',
'x1', 'x2', 'x3', 'x4', 'x5',
'wspd_std']
wind = pd.read_csv(ipath, sep = "\s*", names = cols,
parse_dates = [[0, 1]], index_col = 0)
wind.index.name = 'Timestamp'
wind.head(3)
In [ ]:
# in the previous cell code you can change 0's and 1's on
# parse_dates and index_col with the names of the columns
# test it!!!
In [ ]:
help(pd.read_csv)
With very few lines of code we read a text file with data separated by whitespaces, we transformed two columns to have dates and that dates are now the index (we only can have one record each time),...
Nothing prevents from having repeated indexes. Take care as it cn be a source of errors.
In [ ]:
tmp = pd.DataFrame([1,10,100, 1000], index = [1,1,2,2], columns = ['values'])
In [ ]:
tmp
In [ ]:
print(tmp['values'][1], tmp['values'][2], sep = '\n')
In [ ]:
# An example with error in dates:
index = [
'01/01/2015 00:00',
'02/01/2015 00:00',
'03/01/2015 00:00',
'04/01/2015 00:00',
'05/01/2015 00:00',
'06/01/2015 00:00',
'07/01/2015 00:00',
'08/01/2015 00:00',
'09/01/2015 00:00',
'10/01/2015 00:00',
'11/01/2015 00:00',
'12/01/2015 00:00',
'13/01/2015 00:00',
'14/01/2015 00:00',
'15/01/2015 00:00'
]
values = np.random.randn(len(index))
tmp = pd.DataFrame(values, index = pd.to_datetime(index), columns = ['col1'])
In [ ]:
display(tmp)
tmp.plot.line(figsize = (12, 6))
To avoid the previous error we can write our own date parser on, for instance, pd.read_csv
:
In [ ]:
import datetime as dt
import io
def dateparser(date):
date, time = date.split()
DD, MM, YY = date.split('/')
hh, mm = time.split(':')
return dt.datetime(int(YY), int(MM), int(DD), int(hh), int(mm))
virtual_file = io.StringIO("""01/01/2015 00:00, 1
02/01/2015 00:00, 2
03/01/2015 00:00, 3
04/01/2015 00:00, 4
05/01/2015 00:00, 5
06/01/2015 00:00, 6
07/01/2015 00:00, 7
08/01/2015 00:00, 8
09/01/2015 00:00, 9
10/01/2015 00:00, 10
11/01/2015 00:00, 11
12/01/2015 00:00, 12
13/01/2015 00:00, 13
14/01/2015 00:00, 14
15/01/2015 00:00, 15
""")
tmp_wrong = pd.read_csv(virtual_file, parse_dates = [0], index_col = 0, names = ['Date', 'values'])
virtual_file = io.StringIO("""01/01/2015 00:00, 1
02/01/2015 00:00, 2
03/01/2015 00:00, 3
04/01/2015 00:00, 4
05/01/2015 00:00, 5
06/01/2015 00:00, 6
07/01/2015 00:00, 7
08/01/2015 00:00, 8
09/01/2015 00:00, 9
10/01/2015 00:00, 10
11/01/2015 00:00, 11
12/01/2015 00:00, 12
13/01/2015 00:00, 13
14/01/2015 00:00, 14
15/01/2015 00:00, 15
""")
tmp_right = pd.read_csv(virtual_file, parse_dates = True, index_col = 0, names = ['Date', 'values'],
date_parser = dateparser)
In [ ]:
display(tmp_wrong)
display(tmp_right)
In [ ]:
opath = os.path.join('Datos', 'mast_2.csv')
#wind.to_csv(opath)
wind.iloc[0:100].to_csv(opath)
In [ ]:
#wind.to_json(opath.replace('csv', 'json'))
wind.iloc[0:100].to_json(opath.replace('csv', 'json'))
In [ ]:
# Si son muchos datos no os lo recomiendo, es lento
#wind.to_html(opath.replace('csv', 'html'))
wind.iloc[0:100].to_html(opath.replace('csv', 'html'))
In [ ]:
writer = pd.ExcelWriter(opath.replace('csv', 'xlsx'))
#wind.to_excel(writer, sheet_name= "Mi hoja 1")
wind.iloc[0:100].to_excel(writer, sheet_name= "Mi hoja 1")
writer.save()
In [ ]:
# Now that we have files with json, html, xlsx,..., formats you can practice what we have learn opening them
# using the pd.read_* functions